package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.analyse.ParseState;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * {@link AnalyseNews} implementation for news pages hosted under 66163.com.
 *
 * <p>It recognises article ("detail") URLs of the site and extracts title,
 * publish date, body and author from the downloaded HTML. Four page layouts
 * are handled, selected by the host part of the URL:
 * {@code auto.66163.com}, {@code fc.66163.com}/{@code jk.66163.com},
 * {@code it.66163.com}, and a default layout for every other URL.
 *
 * @author axi
 */
public class _66163NewsAnalyse implements AnalyseNews {

	/**
	 * Patterns an article URL must match in full. They are compiled once
	 * instead of on every call, and the literal dots of host names and file
	 * extensions are escaped so they no longer match arbitrary characters.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
			Pattern.compile("http://.*\\.66163\\.com/[0-9]*-[0-9]*-[0-9]*/[0-9]*\\.shtml"),
			Pattern.compile("http://auto\\.66163\\.com/news/storys_[0-9]+\\.html"),
			Pattern.compile("http://auto\\.66163\\.com/news/[0-9]{8}/storys_[0-9]+\\.html"),
			Pattern.compile("http://finance\\.66163\\.com/[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}/[0-9]+\\.shtml"),
			Pattern.compile("http://fc\\.66163\\.com/[0-9]+\\.html"),
			// Patterns that were already disabled in this file, kept for reference:
			//   "http://jk.66163.com/info.php[?]cid[=][0-9]+"
			//   "http://finance.66163.com/[0-9]/[0-9]+b.shtml"
	};

	/**
	 * Tells whether a URL is an article page of the site.
	 *
	 * @param url the URL to test; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches one of the known article
	 *         URL patterns, {@code false} otherwise
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Parses the HTML carried by {@code urlMeta} into a {@link NewsMeta}.
	 *
	 * <p>Note: this method does not call {@link #isDetailPage(String)} itself;
	 * it always attempts to parse the given page.
	 *
	 * <p>If the title is blank or no date was found, and the page contains the
	 * site's "link does not exist" notice, the returned object only has its
	 * update URL set to {@code ParseState.ERR404}. Otherwise title, content,
	 * date, URL, type ({@code 0}), comment count ({@code 0}), click count
	 * ({@code 0}) and author are set, even when some of them could not be
	 * extracted (title/content then stay empty and date stays {@code null}).
	 *
	 * @param urlMeta the crawled page; its URL and HTML are read
	 * @return the populated news record, never {@code null}
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String url = urlMeta.getUrl();
		String htmltxt = urlMeta.getHtml();

		String title = "";
		String content = "";
		Long date = null;
		Integer commentNum = 0;
		Integer clickNum = 0;

		NewsMeta news = new NewsMeta();
		Document doc = Jsoup.parse(htmltxt);

		if (url.contains("auto.66163.com")) {
			Elements select = doc.select("div.ina_news_text");
			if (!select.isEmpty()) {
				// Title
				Elements selectTitle = select.select("h1 > p");
				if (!selectTitle.isEmpty()) {
					title = selectTitle.text();
				}
				// Publish date: read from the second <span> of the heading.
				// Require at least two spans; the previous "size() > 0" check
				// let get(1) throw IndexOutOfBoundsException on a single span.
				Elements selectDate = select.select("h1 > span");
				if (selectDate.size() > 1) {
					Element element = selectDate.get(1);
					date = DateUtils.matchDate(element.toString());
				}
				// Body
				Elements selectContent = select.select("div.ina_news_pic_text");
				if (!selectContent.isEmpty()) {
					content = HtmlCleaner.getContentHtml(url, selectContent);
				}
			}
		} else if (url.contains("fc.66163.com") || url.contains("jk.66163.com")) {
			Elements select = doc.select("div.jklist_zc");
			if (!select.isEmpty()) {
				// Title
				Elements selectTitle = select.select("h2");
				if (!selectTitle.isEmpty()) {
					title = selectTitle.text();
				}
				// Publish date
				Elements selectDate = select.select("h3");
				if (!selectDate.isEmpty()) {
					date = DateUtils.matchDate(selectDate.toString());
				}
				// Body
				Elements selectContent = select.select("div.jklist_zcont");
				if (!selectContent.isEmpty()) {
					content = HtmlCleaner.getContentHtml(url, selectContent);
				}
			}
		} else if (url.contains("http://it.66163.com")) {
			// Title
			title = doc.select("h1").text();
			// Publish date (matched against whatever the selector yields,
			// possibly an empty string)
			date = DateUtils.matchDate(doc.select("span.zecc").toString());
			// Body
			Elements selectContent = doc.select("div.nry_zw");
			if (!selectContent.isEmpty()) {
				content = HtmlCleaner.getContentHtml(url, selectContent);
			}
		} else {
			// Default layout
			// Title
			Elements selectTitle = doc.select("div#ArticleTit");
			if (!selectTitle.isEmpty()) {
				title = selectTitle.text();
			}
			// Publish date
			Elements selectDate = doc.select("div#tima");
			if (!selectDate.isEmpty()) {
				date = DateUtils.matchDate(selectDate.toString());
			}
			// Body
			Elements selectContent = doc.select("div#wz");
			if (!selectContent.isEmpty()) {
				content = HtmlCleaner.getContentHtml(url, selectContent);
			}
		}

		// Extraction failed: check whether the site served its
		// "link does not exist" page and report that as a 404.
		if (StringUtils.isBlank(title) || date == null) {
			if (doc.select("div.adiv:contains(您访问的链接不存在!)").size() > 0) {
				news.setUpdateUrl(ParseState.ERR404.toString());
				return news;
			}
		}

		// Author is looked up via the "来源：" ("source:") label.
		String author = JSoupUtils.matchAuthor(doc, "来源：");

		news.setTitle(title);
		news.setContent(content);
		news.setDate(date);
		news.setUrl(url);
		news.setType(0);
		news.setCommentNum(commentNum);
		news.setClickNum(clickNum);

		news.setAuthor(author);
		return news;
	}

	/**
	 * Update hook; not implemented for this site.
	 *
	 * @param meta the previously parsed record (unused)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * Manual smoke test: fetches one article URL, then prints whether it is
	 * recognised as a detail page and the parsed result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		_66163NewsAnalyse s = new _66163NewsAnalyse();
		// Other sample URLs for manual runs:
		//   http://news.66163.com/2012-11-23/706679.shtml
		//   http://auto.66163.com/news/storys_19792.html
		//   http://finance.66163.com/2012-11-22/706429.shtml
		//   http://fc.66163.com/404875.html
		//   http://jk.66163.com/info.php?cid=404303
		String url = "http://finance.66163.com/2015-06-29/1028662.shtml";
		System.out.println(s.isDetailPage(url));
		UrlMeta meta = CrawlHTML.responseToURL(url);
		NewsMeta parserHtml = s.parserHtml(meta);
		System.out.println(parserHtml);
	}

	/**
	 * Tells whether parsed records of this site need a later update pass.
	 *
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
